# utils/convert_encoding.py
import os
from pathlib import Path

def is_gbk_encoded(file_path):
    """Return True if the file at *file_path* looks like it is already GBK.

    Merely decoding as GBK is not enough evidence: GBK accepts most pairs of
    high bytes, so multi-byte UTF-8 text frequently decodes as GBK without an
    error (producing mojibake). UTF-8, by contrast, is strict, so bytes that
    are valid non-ASCII UTF-8 are treated as UTF-8 rather than GBK.

    The result is a heuristic. Pure-ASCII (and empty) files are byte-identical
    in both encodings and are reported as GBK. A genuine GBK file whose bytes
    also happen to form valid UTF-8 is reported as not GBK; this is rare for
    real text but possible for very short files.

    Args:
        file_path: Path (str or path-like) of the file to inspect.

    Returns:
        True if the bytes decode as GBK and are either not valid UTF-8 or
        pure ASCII; False otherwise.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Inspect raw bytes so no newline translation or implicit decoding occurs.
    with open(file_path, 'rb') as f:
        raw = f.read()

    try:
        raw.decode('gbk')
    except UnicodeDecodeError:
        return False

    try:
        as_utf8 = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Valid GBK but invalid UTF-8: the file really is GBK.
        return True

    # Valid under both codecs. Only pure ASCII (exactly one byte per
    # character) is the same text either way; anything else is UTF-8 that
    # merely happens to decode as GBK mojibake.
    return len(as_utf8) == len(raw)

# Convert every UTF-8 ``*.txt`` file in the dataset directory to GBK, in place.
# Files that already look like GBK, are not valid UTF-8, or contain characters
# GBK cannot represent are left untouched and reported.
for txt_file in Path("dataset/pdf_txt_file_my").glob("*.txt"):
    try:
        # Nothing to do when the file is already GBK.
        if is_gbk_encoded(txt_file):
            print(f"Skipped (already in gbk): {txt_file}")
            continue

        # Work on raw bytes so the original line endings are carried over
        # unchanged instead of being rewritten by text-mode newline handling.
        raw = txt_file.read_bytes()

        # Decode strictly: silently dropping undecodable bytes and then
        # overwriting the file would destroy data. 'utf-8-sig' also strips a
        # leading BOM, which GBK has no way to encode.
        try:
            content = raw.decode('utf-8-sig')
        except UnicodeDecodeError:
            print(f"Skipped (not valid utf-8): {txt_file}")
            continue

        # Encode once, before touching the file, so a character that GBK
        # cannot represent never leaves a truncated file behind.
        try:
            encoded = content.encode('gbk')
        except UnicodeEncodeError:
            print(f"Skipped (encoding issue): {txt_file}")
            continue

        # Overwrite the file with its GBK-encoded bytes.
        txt_file.write_bytes(encoded)
        print(f"Converted: {txt_file}")
    except Exception as e:
        # Per-file boundary: report the failure and carry on with the
        # remaining files rather than aborting the whole batch.
        print(f"Error processing {txt_file}: {e}")